library(ggplot2)
library(rafalib)
df <- read.table("~/data/molly/modified_bases.6mA.bed")
colnames(df) <- c("chrom", "start", "end", "name", "score", "strand", "startCodon", "stopCodon", "color", "coverage", "percentage")
hist(df$coverage, breaks=400, xlim=c(0,1000))
df <- df[df$coverage > 10,]
plot(df$percentage[1:1e4], pch=16, cex=1/2)
dfHML <- df[df$chrom == "III" & df$start > 1 & df$end < 2.2e4,]
plot(x=dfHML$start, y=dfHML$percentage, pch=16,main="HML region", cex=dfHML$coverage/100,
xlab="Position on chr3", ylab="Percentage of methylated reads")
dfHMR <- df[df$chrom == "III" & df$start > 285e3 & df$end < 300e3,]
plot(x=dfHMR$start, y=dfHMR$percentage, pch=16, main="HMR region", cex=dfHMR$coverage/100,
xlab="Position on chr3", ylab="Percentage of methylated reads")
dfHMR2 <- df[df$chrom == "III" & df$start > 293000 & df$end < 294000,]
plot(x=dfHMR2$start, y=dfHMR2$percentage, pch=16, main="HMR region", cex=dfHMR2$coverage/100,
xlab="Position on chr3", ylab="Percentage of methylated reads")
nucDat <- openxlsx::read.xlsx("../../data/HMR_HML_nucleosome positions.xlsx")
nucDatHMR <- nucDat[nucDat$region == "HMR",]
dfHMRNuc <- dfHMR[dfHMR$start > min(nucDatHMR$Start) & dfHMR$stop < max(nucDatHMR$Stop),]
plot(x=dfHMRNuc$start, y=dfHMRNuc$percentage, cex=log(dfHMRNuc$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")
pListHMR <- list()
for(nn in 1:(nrow(nucDatHMR)-1)){
curStart <- nucDatHMR$dyad[nn]
curStop <- nucDatHMR$dyad[nn+1]
curDf <- dfHMRNuc[dfHMRNuc$start>=curStart & dfHMRNuc$end<=curStop,]
curDf$normPos <- (curDf$start - min(curDf$start))
curDf$normPos <- curDf$normPos / max(curDf$normPos)
pListHMR[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
geom_point(col="grey", alpha=.4) +
geom_smooth(aes(weight=coverage)) +
theme_bw() +
ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
xlab("Dyad distance") +
ylab("Percentage methylated")
}
cowplot::plot_grid(plotlist=pListHMR)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave("../../plots/dyadMethylationHMR_Doudna.pdf",
width=16, height=10)
##HML
nucDatHML <- nucDat[nucDat$region == "HML",]
dfHMLNuc <- dfHML[dfHML$start > min(nucDatHML$Start) & dfHML$stop < max(nucDatHML$Stop),]
plot(x=dfHMLNuc$start, y=dfHMLNuc$percentage, cex=log(dfHMLNuc$coverage)/10, pch=16)
abline(v=nucDatHML$dyad, col="red")
pListHML <- list()
for(nn in 1:(nrow(nucDatHML)-1)){
curStart <- nucDatHML$dyad[nn]
curStop <- nucDatHML$dyad[nn+1]
curDf <- dfHMLNuc[dfHMLNuc$start>=curStart & dfHMLNuc$end<=curStop,]
curDf$normPos <- (curDf$start - min(curDf$start))
curDf$normPos <- curDf$normPos / max(curDf$normPos)
pListHML[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
geom_point(col="grey", alpha=.4) +
geom_smooth(aes(weight=coverage)) +
theme_bw() +
ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
xlab("Dyad distance") +
ylab("Percentage methylated")
}
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$normPos): no non-missing arguments to max; returning -Inf
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$stop): no non-missing arguments to max; returning -Inf
cowplot::plot_grid(plotlist=pListHML)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave("../../plots/dyadMethylationHML_Doudna.pdf",
width=16, height=10)
dfTurkey <- read.table("~/data/molly/megalodon_output_06/modified_bases.aggregate06.6mA.bed")
colnames(dfTurkey) <- c("chrom", "start", "end", "name", "score", "strand", "startCodon", "stopCodon", "color", "coverage", "percentage")
dfHMRNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHMR$Start) & dfTurkey$stop < max(nucDatHMR$Stop),]
plot(x=dfHMRNucTurkey$start, y=dfHMRNucTurkey$percentage, cex=log(dfHMRNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")
pListHMRTurkey <- list()
for(nn in 1:(nrow(nucDatHMR)-1)){
curStart <- nucDatHMR$dyad[nn]
curStop <- nucDatHMR$dyad[nn+1]
curDf <- dfHMRNucTurkey[dfHMRNucTurkey$start>=curStart & dfHMRNucTurkey$end<=curStop,]
curDf$normPos <- (curDf$start - min(curDf$start))
curDf$normPos <- curDf$normPos / max(curDf$normPos)
pListHMRTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
geom_point(col="grey", alpha=.4) +
geom_smooth(aes(weight=coverage)) +
theme_bw() +
ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
xlab("Dyad distance") +
ylab("Percentage methylated")
}
cowplot::plot_grid(plotlist=pListHMRTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
ggsave("../../plots/dyadMethylationHMR_turkey06.pdf",
width=16, height=10)
dfHMLNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHML$Start) & dfTurkey$stop < max(nucDatHML$Stop),]
plot(x=dfHMLNucTurkey$start, y=dfHMLNucTurkey$percentage, cex=log(dfHMLNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")
pListHMLTurkey <- list()
for(nn in 1:(nrow(nucDatHML)-1)){
curStart <- nucDatHML$dyad[nn]
curStop <- nucDatHML$dyad[nn+1]
curDf <- dfHMLNucTurkey[dfHMLNucTurkey$start>=curStart & dfHMLNucTurkey$end<=curStop,]
curDf$normPos <- (curDf$start - min(curDf$start))
curDf$normPos <- curDf$normPos / max(curDf$normPos)
pListHMLTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
geom_point(col="grey", alpha=.4) +
geom_smooth(aes(weight=coverage)) +
theme_bw() +
ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
xlab("Dyad distance") +
ylab("Percentage methylated")
}
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$normPos): no non-missing arguments to max; returning -Inf
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$stop): no non-missing arguments to max; returning -Inf
cowplot::plot_grid(plotlist=pListHMLTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggsave("../../plots/dyadMethylationHML_turkey06.pdf",
width=16, height=10)
dfTurkey <- read.table("~/data/molly/megalodon_output_07/modified_bases.aggregate07.6mA.bed")
colnames(dfTurkey) <- c("chrom", "start", "end", "name", "score", "strand", "startCodon", "stopCodon", "color", "coverage", "percentage")
dfHMRNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHMR$Start) & dfTurkey$stop < max(nucDatHMR$Stop),]
plot(x=dfHMRNucTurkey$start, y=dfHMRNucTurkey$percentage, cex=log(dfHMRNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")
pListHMRTurkey <- list()
for(nn in 1:(nrow(nucDatHMR)-1)){
curStart <- nucDatHMR$dyad[nn]
curStop <- nucDatHMR$dyad[nn+1]
curDf <- dfHMRNucTurkey[dfHMRNucTurkey$start>=curStart & dfHMRNucTurkey$end<=curStop,]
curDf$normPos <- (curDf$start - min(curDf$start))
curDf$normPos <- curDf$normPos / max(curDf$normPos)
pListHMRTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
geom_point(col="grey", alpha=.4) +
geom_smooth(aes(weight=coverage)) +
theme_bw() +
ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
xlab("Dyad distance") +
ylab("Percentage methylated")
}
cowplot::plot_grid(plotlist=pListHMRTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
ggsave("../../plots/dyadMethylationHMR_turkey07.pdf",
width=16, height=10)
dfHMLNucTurkey <- dfTurkey[dfTurkey$chrom == "III" & dfTurkey$start > min(nucDatHML$Start) & dfTurkey$stop < max(nucDatHML$Stop),]
plot(x=dfHMLNucTurkey$start, y=dfHMLNucTurkey$percentage, cex=log(dfHMLNucTurkey$coverage)/10, pch=16)
abline(v=nucDatHMR$dyad, col="red")
pListHMLTurkey <- list()
for(nn in 1:(nrow(nucDatHML)-1)){
curStart <- nucDatHML$dyad[nn]
curStop <- nucDatHML$dyad[nn+1]
curDf <- dfHMLNucTurkey[dfHMLNucTurkey$start>=curStart & dfHMLNucTurkey$end<=curStop,]
curDf$normPos <- (curDf$start - min(curDf$start))
curDf$normPos <- curDf$normPos / max(curDf$normPos)
pListHMLTurkey[[nn]] <- ggplot(curDf, aes(x=normPos, y=percentage)) +
geom_point(col="grey", alpha=.4) +
geom_smooth(aes(weight=coverage)) +
theme_bw() +
ggtitle(paste0("chrIII: ",round(min(curDf$start)/1e3,2),"k-",round(max(curDf$stop)/1e3,2),"k")) +
xlab("Dyad distance") +
ylab("Percentage methylated")
}
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$normPos): no non-missing arguments to max; returning -Inf
## Warning in min(curDf$start): no non-missing arguments to min; returning Inf
## Warning in max(curDf$stop): no non-missing arguments to max; returning -Inf
cowplot::plot_grid(plotlist=pListHMLTurkey)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
ggsave("../../plots/dyadMethylationHML_turkey07.pdf",
width=16, height=10)
library(Rsamtools)
## Loading required package: GenomeInfoDb
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
# file below is the mod_mappings.bam file on Savio, sorted using 'samtools sort'
bamPath <- "~/data/molly/mod_mappings.sorted.bam"
bamFile <- BamFile(bamPath)
bamFile
## class: BamFile
## path: /Users/koenvandenberge/data/molly/mod_mappings.sorted.bam
## index: /Users/koenvandenberge/data/molly/mod_mappings.sorted.bam.bai
## isOpen: FALSE
## yieldSize: NA
## obeyQname: FALSE
## asMates: FALSE
## qnamePrefixEnd: NA
## qnameSuffixStart: NA
scanBamHeader(bamFile)
## $targets
## I II III IV IX MT V VI VII VIII
## 230218 813184 316620 1531933 439888 85779 576874 270161 1090940 562643
## X XI XII XIII XIV XV XVI
## 745751 666816 1078177 924431 784333 1091291 948066
##
## $text
## $text$`@HD`
## [1] "VN:1.4" "SO:coordinate"
##
## $text$`@SQ`
## [1] "SN:I" "LN:230218"
##
## $text$`@SQ`
## [1] "SN:II" "LN:813184"
##
## $text$`@SQ`
## [1] "SN:III" "LN:316620"
##
## $text$`@SQ`
## [1] "SN:IV" "LN:1531933"
##
## $text$`@SQ`
## [1] "SN:IX" "LN:439888"
##
## $text$`@SQ`
## [1] "SN:MT" "LN:85779"
##
## $text$`@SQ`
## [1] "SN:V" "LN:576874"
##
## $text$`@SQ`
## [1] "SN:VI" "LN:270161"
##
## $text$`@SQ`
## [1] "SN:VII" "LN:1090940"
##
## $text$`@SQ`
## [1] "SN:VIII" "LN:562643"
##
## $text$`@SQ`
## [1] "SN:X" "LN:745751"
##
## $text$`@SQ`
## [1] "SN:XI" "LN:666816"
##
## $text$`@SQ`
## [1] "SN:XII" "LN:1078177"
##
## $text$`@SQ`
## [1] "SN:XIII" "LN:924431"
##
## $text$`@SQ`
## [1] "SN:XIV" "LN:784333"
##
## $text$`@SQ`
## [1] "SN:XV" "LN:1091291"
##
## $text$`@SQ`
## [1] "SN:XVI" "LN:948066"
##
## $text$`@RG`
## [1] "ID:1" "SM:SAMPLE"
seqinfo(bamFile)
## Seqinfo object with 17 sequences from an unspecified genome:
## seqnames seqlengths isCircular genome
## I 230218 <NA> <NA>
## II 813184 <NA> <NA>
## III 316620 <NA> <NA>
## IV 1531933 <NA> <NA>
## IX 439888 <NA> <NA>
## ... ... ... ...
## XII 1078177 <NA> <NA>
## XIII 924431 <NA> <NA>
## XIV 784333 <NA> <NA>
## XV 1091291 <NA> <NA>
## XVI 948066 <NA> <NA>
grHMR <- GRanges(seqnames = "III",
ranges = IRanges(start = 285e3, end = 300e3))
sbp <- ScanBamParam(which = grHMR, what = c("qname", # read ID
"seq", # sequence
"pos")) # read position (start)
aln <- scanBam(bamFile, param = sbp)
customParam <- ScanBamParam(which = grHMR,
tag=c("Mm", "Ml"),
what=c("flag", #custom tag (methylation)
"qname", # read ID
"seq", # sequence
"pos")) #start position
aln <- scanBam(bamFile, which=grHMR, param = customParam)
bases <- aln$`III:285000-300000`$seq
methylTag <- aln$`III:285000-300000`$tag$Mm
methylProb <- aln$`III:285000-300000`$tag$Ml
# methylation probabilities do seem to be very much read dependent
rafalib::mypar(mfrow=c(4,4))
hlp <- lapply(methylProb[1:112], hist, breaks=40, xlim=c(0,255))
# See https://github.com/jkbonfield/hts-specs/blob/methylation/SAMtags.tex
# for explanation on how the Mm and Ml tags work.
posProbList <- list()
for(rr in 1:length(aln$`III:285000-300000`$seq)){
# loop over each read
curSeq <- aln$`III:285000-300000`$seq[rr][[1]]
curStart <- aln$`III:285000-300000`$pos[rr]
curTag <- aln$`III:285000-300000`$tag$Mm[rr]
curProb <- aln$`III:285000-300000`$tag$Ml[[rr]]
# extract A bases and corresponding positions
# start at starting position, and total number of positions equal total number of As in sequence.
posA <- curStart + which(isMatchingAt("A", curSeq, at = 1:length(curSeq))) - 1
# get at which As are considered possibly methylated
tagSplit <- gsub(pattern=";", x=strsplit(curTag, split=",")[[1]], replacement="")
stopifnot(tagSplit[1] == "A+a") # check if we're looking at m6A
tagSplit <- as.numeric(tagSplit[-1])
# number of As considered for methylation should be equal to length of probability vector
stopifnot((sum(tagSplit == 0) + sum(tagSplit > 0)) == length(curProb))
#get pos and prob of As considered potentially methylated
posMet <- posA[cumsum(tagSplit+1)]
probMet <- curProb / 255
df <- data.frame(pos = posMet,
prob = probMet)
# #not considered methylated: doesn't work yet
# posNull <- posA[-cumsum(tagSplit+1)]
# probNull <- rep(0, length(posNull))
# df <- data.frame(pos = c(posMet, posNull),
# prob = c(probMet, probNull)) # positions and methyl probs
posProbList[[rr]] <- df
}
rafalib::mypar(mfrow=c(1,1))
pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
par(bty="l")
plot(x=seq(285e3, 300e3, length=length(posProbList)),
y=1:length(posProbList),
type = "n",
xlab = "Position on ChrIII",
ylab= "Read colored by methylation probability")
for(ii in 1:length(posProbList)){
curPos <- posProbList[[ii]]$pos
points(x=curPos, y=rep(ii, length(curPos)), col=pal[posProbList[[ii]]$prob*255],
pch=16, cex=1/20)
}
abline(v=292e3, lty=2)
abline(v=295e3, lty=2)
for(tt in 0:7){
binaryPal <- wesanderson::wes_palette("Zissou1", n=2, type="continuous")
rafalib::mypar(mfrow=c(1,1))
#pal <- wesanderson::wes_palette("Zissou1", n=256, type="continuous")
par(bty="l")
plot(x=seq(285e3, 300e3, length=length(posProbList)),
y=1:length(posProbList),
type = "n",
xlab = "Position on ChrIII",
ylab= "Read colored by methylation probability",
main = paste0("Lower: ",(0.4-(tt*0.05))*100,"%, upper: ", (0.6+(tt*0.05))*100,"%"))
for(ii in 1:length(posProbList)){
curDf <- posProbList[[ii]]
curDf <- curDf[curDf$prob >= 0.6+(tt*0.05) | curDf$prob <= 0.4-(tt*0.05),]
curPos <- curDf$pos
curMet <- as.numeric(curDf$prob >= .8)
points(x=curPos, y=rep(ii, length(curPos)), col=binaryPal[curMet+1],
pch=16, cex=1/20)
}
abline(v=292e3, lty=2)
abline(v=295e3, lty=2)
}